From e435e6c0277a3a8882f52fb1091d4e70b6a6eab6 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Fri, 14 Mar 2003 13:19:58 +0000 Subject: [PATCH] bitkeeper revision 1.122.5.1 (3e71d6fe7FguR-sT8s7ha1pGTKuYSA) Many files: Sort out interrupt distribution in SMP systems. We now periodically redistribute towards the most idle processors. There's more sport to be had here though... --- xen/arch/i386/io_apic.c | 96 +++++++++++++++++++++++++++++----- xen/arch/i386/irq.c | 5 ++ xen/arch/i386/process.c | 1 + xen/arch/i386/setup.c | 1 + xen/arch/i386/smpboot.c | 2 + xen/common/schedule.c | 10 ++++ xen/include/asm-i386/hardirq.h | 1 + xen/include/asm-i386/smpboot.h | 20 +++---- xen/include/xeno/sched.h | 4 +- 9 files changed, 116 insertions(+), 24 deletions(-) diff --git a/xen/arch/i386/io_apic.c b/xen/arch/i386/io_apic.c index 6ad37f2399..fbea77e646 100644 --- a/xen/arch/i386/io_apic.c +++ b/xen/arch/i386/io_apic.c @@ -189,6 +189,86 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +static void set_ioapic_affinity (unsigned int irq, unsigned long mask) +{ + unsigned long flags; + + /* + * Only the first 8 bits are valid. + */ + mask = mask << 24; + spin_lock_irqsave(&ioapic_lock, flags); + __DO_ACTION(1, = mask, ) + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#if CONFIG_SMP + +typedef struct { + unsigned int cpu; + unsigned long timestamp; +} ____cacheline_aligned irq_balance_t; + +static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned + = { [ 0 ... 
NR_IRQS-1 ] = { 0, 0 } }; + +extern unsigned long irq_affinity [NR_IRQS]; + +#endif + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu,allowed_mask) \ + ((1 << cpu) & (allowed_mask)) + +static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu >= smp_num_cpus) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = smp_num_cpus-1; + } + } while (!IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int irq) +{ +#if CONFIG_SMP + irq_balance_t *entry = irq_balance + irq; + unsigned long now = jiffies; + + if (unlikely(entry->timestamp != now)) { + unsigned long allowed_mask; + int random_number; + + rdtscl(random_number); + random_number &= 1; + + allowed_mask = cpu_online_map & irq_affinity[irq]; + entry->timestamp = now; + entry->cpu = move(entry->cpu, allowed_mask, now, random_number); + set_ioapic_affinity(irq, apicid_to_phys_cpu_present(entry->cpu)); + } +#endif +} + /* * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to * specific CPU-side IRQs. 
@@ -1233,6 +1313,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq) */ static void ack_edge_ioapic_irq(unsigned int irq) { + balance_irq(irq); if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(irq); @@ -1272,6 +1353,8 @@ static void end_level_ioapic_irq (unsigned int irq) unsigned long v; int i; + balance_irq(irq); + /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -1328,19 +1411,6 @@ static void end_level_ioapic_irq (unsigned int irq) static void mask_and_ack_level_ioapic_irq (unsigned int irq) { /* nothing */ } -static void set_ioapic_affinity (unsigned int irq, unsigned long mask) -{ - unsigned long flags; - /* - * Only the first 8 bits are valid. - */ - mask = mask << 24; - - spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = mask, ) - spin_unlock_irqrestore(&ioapic_lock, flags); -} - /* * Level and edge triggered IO-APIC interrupts need different handling, * so we use two separate IRQ descriptors. Edge triggered IRQs can be diff --git a/xen/arch/i386/irq.c b/xen/arch/i386/irq.c index 312cfe7970..e799542b1b 100644 --- a/xen/arch/i386/irq.c +++ b/xen/arch/i386/irq.c @@ -60,6 +60,11 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}}; +#ifdef CONFIG_SMP +/* NB. XXX We'll want some way of fiddling with this from DOM0. */ +unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL }; +#endif + /* * Special irq handlers. 
*/ diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c index 3c048d72bf..c9736a2093 100644 --- a/xen/arch/i386/process.c +++ b/xen/arch/i386/process.c @@ -85,6 +85,7 @@ void cpu_idle (void) for ( ; ; ) { + irq_stat[cpu].idle_timestamp = jiffies; while (!current->hyp_events && !softirq_pending(cpu)) default_idle(); do_hyp_events(); diff --git a/xen/arch/i386/setup.c b/xen/arch/i386/setup.c index 1e5f35a73e..6b2c380adc 100644 --- a/xen/arch/i386/setup.c +++ b/xen/arch/i386/setup.c @@ -20,6 +20,7 @@ unsigned long wait_init_idle; /* Basic page table for each CPU in the system. */ l2_pgentry_t *idle_pg_table[NR_CPUS] = { idle0_pg_table }; +struct task_struct *idle_task[NR_CPUS] = { &idle0_task }; /* for asm/domain_page.h, map_domain_page() */ unsigned long *mapcache[NR_CPUS]; diff --git a/xen/arch/i386/smpboot.c b/xen/arch/i386/smpboot.c index 0955db82f3..401b8f8020 100644 --- a/xen/arch/i386/smpboot.c +++ b/xen/arch/i386/smpboot.c @@ -699,6 +699,8 @@ static void __init do_boot_cpu (int apicid) SET_DEFAULT_FAST_TRAP(&idle->thread); + idle_task[cpu] = idle; + /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 787b43d900..1c8d751e4d 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -174,6 +174,7 @@ long schedule_timeout(long timeout) } /* RN: XXX turn this into do_halt() */ +/* KAF: No, turn it back into do_yield()! */ /* * yield the current process */ @@ -281,6 +282,15 @@ asmlinkage void schedule(void) return; } + +/* No locking needed -- pointer comparison is safe :-) */ +int idle_cpu(int cpu) +{ + struct task_struct *p = schedule_data[cpu].curr; + return p == idle_task[cpu]; +} + + /* * The scheduling timer. 
*/ diff --git a/xen/include/asm-i386/hardirq.h b/xen/include/asm-i386/hardirq.h index bad529b882..f0a9024dcd 100644 --- a/xen/include/asm-i386/hardirq.h +++ b/xen/include/asm-i386/hardirq.h @@ -10,6 +10,7 @@ typedef struct { unsigned int __local_irq_count; unsigned int __local_bh_count; unsigned int __syscall_count; + unsigned long idle_timestamp; } ____cacheline_aligned irq_cpustat_t; #include /* Standard mappings for irq_cpustat_t above */ diff --git a/xen/include/asm-i386/smpboot.h b/xen/include/asm-i386/smpboot.h index 3ca484d531..4017902c69 100644 --- a/xen/include/asm-i386/smpboot.h +++ b/xen/include/asm-i386/smpboot.h @@ -30,6 +30,15 @@ static inline void detect_clustered_apic(char* oem, char* prod) /*Start cyclone clock*/ cyclone_setup(0); } + else if (!strncmp(oem, "IBM ENSW", 8) && !strncmp(prod, "RUTHLESS SMP", 9)){ + clustered_apic_mode = CLUSTERED_APIC_XAPIC; + apic_broadcast_id = APIC_BROADCAST_ID_XAPIC; + int_dest_addr_mode = APIC_DEST_PHYSICAL; + int_delivery_mode = dest_Fixed; + esr_disable = 1; + /*Start cyclone clock*/ + cyclone_setup(0); + } else if (!strncmp(oem, "IBM NUMA", 8)){ clustered_apic_mode = CLUSTERED_APIC_NUMAQ; apic_broadcast_id = APIC_BROADCAST_ID_APIC; @@ -116,15 +125,6 @@ static inline int target_cpus(void) return cpu_online_map; } #else -/* KAF Xen: Round-robin allocate IRQs to CPUs. 
*/ -static inline int target_cpus(void) -{ - static unsigned int cpu_field = 1; - do { - cpu_field <<= 1; - if ( cpu_field == 0x100 ) cpu_field = 1; /* logical field == 8 bits */ - } while ( (cpu_field & cpu_online_map) == 0 ); - return cpu_field; -} +#define target_cpus() (0xFF) #endif #endif diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index 6d1842a2ea..49fca609ec 100644 --- a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -149,6 +149,7 @@ struct task_struct { next_task: &(_t) \ } +extern struct task_struct *idle_task[NR_CPUS]; #define IDLE_DOMAIN_ID (~0) #define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID) @@ -214,7 +215,8 @@ asmlinkage void schedule(void); void domain_init(void); -void cpu_idle(void); +int idle_cpu(int cpu); /* Is CPU 'cpu' idle right now? */ +void cpu_idle(void); /* Idle loop. */ #define REMOVE_LINKS(p) do { \ (p)->next_task->prev_task = (p)->prev_task; \ -- 2.30.2